\documentclass[12pt]{article}
\usepackage{setspace}
%\usepackage[numbers, sort&compress]{natbib}
\usepackage{natbib}
\usepackage{graphicx}
\usepackage{pslatex}
\usepackage{subfig}
\usepackage{float}
\usepackage{hyperref}
\usepackage{rotating}
\usepackage{lineno}
\usepackage{amsmath, amssymb, amsfonts,euscript,mathrsfs}
\usepackage{bm,bbm,latexsym,amsthm}
\usepackage{multirow}
\usepackage{psfig, epsfig, verbatim}
\usepackage[top=1.5in,bottom=1in,left=.5in,right=.5in]{geometry}
\usepackage{comment}


\newcommand{\PMTen}{PM$_{10}$ }
\newcommand{\PMTwo}{PM$_{2.5}$ }
\newcommand\independent{\protect\mathpalette{\protect\independenT}{\perp}} \def\independenT#1#2{\mathrel{\rlap{$#1#2$}\mkern2mu{#1#2}}}
\newcommand{\Xbar}{\bar{X}}
\newcommand{\Ozone}{O$_3$ }
\newcommand{\SOTwo}{SO$_2$ }
\newcommand{\NOx}{NO$_x$ }
\newcommand{\COTwo}{CO$_2$ }
\input{commands-book.tex}
\newtheorem{as}{{Assumption}}
\newtheorem{thm}{{\bf Theorem}}
\makeatletter 
\newcommand*{\rom}[1]{\expandafter\@slowromancap\romannumeral #1@} 
\makeatother 

\doublespacing


\title{Appendices for: Causal Inference Methods for Estimating Long Term Health Effects of Air Quality Regulations}

\date{}
\author{
Corwin Matthew Zigler\thanks{Harvard TH Chan School of Public Health}
\and Chanmin Kim $^*$
\and Christine Choirat \thanks{Harvard University}
\and John Barret Hansen$^\dagger$
\and Yun Wang $^*$
\and Lauren Hund \thanks {University of New Mexico}
\and Jonathan Samet\thanks{Keck School of Medicine, University of Southern California}
\and Gary King$^\dagger$
\and  Francesca Dominici$^*$
}

\begin{document}

\maketitle

\appendix

\newpage
\tableofcontents

\section{Technical Details for Case Study 1: \PMTen Nonattainment Designations}\label{app:pm10}
\subsection{Missing covariate data in the analysis of Case Study 1}\label{app:pm10:missing}
Rather than exclude observations with missing covariate data, we employ spatial hierarchical model to impute the missing 1990 ambient \PMTen measurements and use these imputations in our analysis.  Specifically, we fit the same type of spatial hierarchical model as detailed below, with the outcome specified as the log-transformed \PMTen concentration during the years 1990.  Covariates included in this model were the same as those in the model of the main text (See Table 1 of the main text), with the addition of a covariate denoting whether a location lies in an attainment or nonattainment area.   In total, ambient average \PMTen during 1990 was imputed for 131 nonattainment and 153 attainment locations in the entire (non-pruned) sample using posterior-predictive means from this model and treated as fixed covariates in the analysis.

\subsection{Locations of Discarded Monitoring Locations}\label{app:pm10:discard}
As discussed in the main text, the propensity score strategy identified 52 locations that did not ``overlap'' with locations in the opposite treatment group.  These locations were discarded for the analysis.  Figure \ref{maps} depicts the locations of all monitoring locations, the locations retained for the analysis, and the locations of those excluded due to lack of overlap.

\begin{figure}
\caption{Locations of all 547 \PMTen monitoring locations available for analysis and for the 495 locations retained after propensity score pruning}\label{maps}
\subfloat[Entire Monitor Set]{
\includegraphics[width=.5\textwidth]{Figures/monitormap_raw.pdf}\label{raw}}
\subfloat[Pruned Monitor Set]{
\includegraphics[width=.5\textwidth]{Figures/monitormap_pruned.pdf}\label{pruned}} \\
\subfloat[Excluded Monitor Set]{
\includegraphics[width=.5\textwidth]{Figures/monitormap_excluded.pdf}\label{excluded}}
\end{figure}


\subsection{Models for Air Pollution and Medicare Health Outcomes}\label{app:pm10:models}
\subsubsection{Notation and Ignorability Assumption}
For any hypothetical allocation of nonattainment designations to the $n=495$ pruned locations, let $\mathbf{A}\equiv[A(s_i)]_{i=1}^{n}$ be the vector of indicators denoting whether each of $n=495$ monitoring locations would fall in a nonattainment county, with $A(s_i)=1$ denoting nonattainment for the $i^{th}$ location, and $A(s_i)=1$ denoting attainment.  We refer to the entire vector $\mathbf{A}$ as a regulation program. We denote a specific regulation program with $\mathbf{A}=\mathbf{a}$, and the observed program representing the actual allocation of nonattainment designations with $\mathbf{A}=\mathbf{a^{obs}}$.  

Let $Y_\mathbf{a}(s)$ denote health outcome in 2001 (either all-cause mortality, CVD hospitalizations, or respiratory  hospitalizations) at location $s$ that would potentially occur under regulation program $\mathbf{A=a}$.  Let $X_\mathbf{a}(s)$ denote the average ambient \PMTen concentration that would potentially be observed during 1999-2001 under regulation program $\mathbf{A=a}$.  Note that the only observed potential outcomes are $(X_{\mathbf{a^{obs}}}(s), Y_{\mathbf{a^{obs}}}(s))$; all others are considered missing data. Let $Z(s)$ denote the vector of covariate values for location $s$ (i.e., those listed in Table 1 of the main text), and also assume that $Z(s)$ contains, in addition to those covariates, indicators of propensity score subclass membership.

We confine attention to the regulation programs $\mathbf{A=0}$ and $\mathbf{A=a^{obs}}$ and define the monitor-level causal effect of $\mathbf{A}$ on ambient \PMTen as the comparison between the potential pollution concentration of that pollutant under the observed nonattainment designations, $X_{a^{obs}}(s)$, and the potential concentration under the a setting with no nonattainment regulations, $X_0(s)$, among the nonattainment areas in the pruned data set.  We similarly define the causal effect of nonattainment designations on a given health outcome for location $s$ as the comparison between $Y_{a^{obs}}(s)$ and $Y_0(s)$.

We assume that assignment to the initial nonattaiment designations is strongly ignorable conditional on covariates.  In other words, there is no unmeasured confounding in the sense that $Z(s)$ contains all factors that tend to differ between nonattainment and attainment locations and that also impact potential pollution and Medicare health outcomes.


\subsubsection{Spatial Hierarchical Model for Air Pollution}
For $f(X_0(s),X_{a^{obs}}(s)|Z(s))$, we propose the following spatial hierarchical model: 
\begin{align}
X(s)=Z^T(s)\beta + W(s) + \epsilon(s), \label{lmcmodelpre}
\end{align}
where $X(s)= (X_0(s), X_{a^{obs}}(s))^T$ is the vector of potential pollution concentrations under each possible nonattainment status, $W(s)$ is a vector of spatially-varying random intercepts, and $\epsilon(s)$ represents nonspatial ``nugget'' error (e.g., measurement error).  We assume $\epsilon(s) \sim MVN(0, \Psi)$, and $\Psi$ diagonal.   $Z^T(s)$ is a $2 \times p$ matrix of time-invariant covariates, where $p$ is the number of covariates (including indicators of propensity score subclass) used to adjust for confounding. The analysis presented here assumes that, conditional on $Z(s)$, potential pollution concentrations under $\mathbf{A}=\mathbf{a^{obs}}$ and  $\mathbf{A}=\mathbf{0}$ are conditionally independent.  Details of the spatial correlation structure can be found in \cite{zigler_estimating_2012} and \cite{banerjee_gaussian_2008}.   


\subsubsection{Log-linear model for Mortality}
For $f(Y_0(s),Y_{a^{obs}}(s)|X_0(s),X_{a^{obs}}(s),Z(s))$, we make use of two additional assumptions. We assume conditional independence of potential health outcomes, conditional on covariates and air pollution: $Y_0(s) \independent Y_{a^{obs}}(s) \big| X_0(s), X_{a^{obs}}(s), Z(s)$.  We also assume that under a given designation, after conditioning on pollution under that designation (and covariates), health outcomes are independent of pollution under the opposite designation:  $f(Y_\mathbf{a}(s) \big| X_0(s), X_{a^{obs}}(s), Z(s)) =f(Y_\mathbf{a}(s) \big| X_\mathbf{a}(s), Z(s))$, for $\mathbf{a}=0,a^{obs}$.  This assumption reflects a belief that knowledge of both $(X_0(s),X_{a^{obs}}(s))$ does not contribute any information pertaining to $Y_\mathbf{a}(s)$ above and beyond that contained in $X_\mathbf{a}(s)$ alone.  As a result of these assumptions, we write $f({Y_0(s),Y_{a^{obs}}(s)|X_0(s),X_{a^{obs}}(s),Z(s)})=\prod f({Y_\mathbf{a}(s)|X_\mathbf{a}(s),Z(s)})$ for $\mathbf{a}=0,a^{obs}$, and model the terms of this product with the following log-linear models:
\begin{align}
\log(E[Y_\mathbf{a}(s)]) = \alpha^\mathbf{a}_{0} + Z^T(s)\alpha^\mathbf{a}_{1} + X_\mathbf{a}(s)\alpha^\mathbf{a}_{2} + \log(N(s)), \label{poismod}
\end{align}
where $\mathbf{a}=0,a^{obs}$, $N(s)$ is the total number of Medicare beneficiaries (for the mortality outcome) or person-years (for the hospitalization outcomes) living near location $s$ , $\alpha^\mathbf{a}_1$ captures relative risks associated with differences in $Z(s)$ under nonattainment program $\mathbf{A=a}$, and $\alpha^\mathbf{a}_2$ captures relative risks associated with differences in post-regulation ambient pollution concentrations under regulation program $\mathbf{A=a}$.


\subsection{Bayesian Estimation}\label{app:pm10:estimation}
Recall that $X(s)=(X_0(s), X_{a^{obs}}(s))^T$ and let $Y(s)=(Y_0(s), Y_{a^{obs}}(s))^T$.  The full joint density of the data can be written as:
\begin{align}
f(\mathbf{X, Y, Z}) =  \int \prod_{i=1}^n  f(Z(s_i),X(s_i),Y(s_i) \big|  \theta)p(\theta) d\theta, 
\end{align}
where $\theta$ is a generic parameter with prior distribution $p(\theta)$.  Distinguishing between the missing ($mis$) and observed ($obs$) quantities in $X(s)$ and $Y(s)$, the posterior distribution of $\theta$ is proportional to:
\begin{align}
p(\theta)& f(\mathbf{Z}) \int \int \prod_{i=1}^n f({X^{mis}(s_i), X^{obs}(s_i), Y^{mis}(s_i), Y^{obs}(s_i) \big| Z(s_i),\theta}) d{Y}^{mis}(s_i)d{X}^{mis}(s_i).  \label{bigjoint}
\end{align}
Inference from (\ref{bigjoint}) is difficult because of the integration over missing potential outcomes, leading us to focus instead on the following joint posterior distribution: 
\begin{align}
p(\theta, \mathbf{X^{mis}, \mathbf{Y^{mis}}} \big| \mathbf{X^{obs}, Y^{obs}, Z}) \propto p(\theta) \prod_{i=1}^{n} f(X^{mis}(s_i), X^{obs}(s_i),Y^{mis}(s_i), Y^{obs}(s_i)|Z(s_i), \theta),
\end{align}
which is convenient for its proportionality to the standard posterior distribution of $\theta$ had all of the potential outcomes been observed \citep{jin_principal_2008}.  Thus, our computational strategy will consist of a Markov chain Monte Carlo (MCMC) data augmentation algorithm that iteratively samples missing potential outcomes conditional on observed data and parameters, then samples parameters and calculates causal effect estimates conditional on ``complete'' data with identified principal strata. 

\subsubsection{Prior distributions and outline of MCMC strategy}
The mechanics of model (\ref{lmcmodelpre}) rely on two key features: the relationship among potential pollution concentrations within a location, and the decay of their correlations across space.  For the relationship among pollutants within a location, note that the cross covariance within a location, $K(s,s)$, is in fact a covariance matrix of the $2$ random effects corresponding to potential pollution concentrations at a common location.  We write $K(s,s)=LL^T$, where $L$ is the lower-triangular Cholesky square root of this covariance matrix, and assume that $K(s,s)$ is the same for all $s$, that is, that the process is stationary.  

For the spatial decay, we define a simpler MVGP, $\tilde{W}(s)$, such that $Var(\tilde{W}_k(s))=1$ and the cross covariance is diagonal: $\tilde{K}(s_i,s_j; \nu)=diag\{\rho_k(s_i,s_j;\nu_k)\}$, where $\rho_k(s_i,s_j ; \nu_k)$ represents a function for the spatial decay of the correlation between the $k^{th}$ element of $\tilde{W}(s)$ across space.  We assume isotropic exponential covariance functions that depend only on the Euclidean distance between locations $s_i$ and $s_j$ ($||s_i-s_j||$), with $\rho_k(s_i, s_j)=e^{-\nu_k||s_i-s_j||}$.  The covariance matrix of $\tilde{W}=[\tilde{W}(s_i)]_{i=1}^n$ can be written as  $\Sigma_{\tilde{W}}=[\tilde{K}(s_i, s_j)]_{i,j=1}^n$.

Rather than model $K(s_i,s_j;\nu)$ directly, we separately specify $\tilde{K}(s_i,s_j;\nu)$ and $LL^T$, and define $W(s)=L\tilde{W}(s)$, which implies that the spatial random effects in (\ref{lmcmodelpre}) are a linear transformation of the simpler MVGP, with transformation defined by the relationships among the pollutants.  With this specification, $K(s_i,s_j;\nu)=L\tilde{K}(s_i,s_j;\nu)L^T$.  

Let $\mathbf{X}^T=[(X_0(s_i)^T, X_1(s_i)^T)]_{i=1}^n$ be the $2n \times 1$ pollution vector and $\mathcal{Z}$ be the $2n \times p$ matrix of regressors.  The above model can, after marginalization over $\tilde{W}$, be equivalently stated as
\begin{align}
\mathbf{X} \sim MVN(\mathcal{Z}\beta, \mathcal{L}\Sigma_{\tilde{W}}\mathcal{L}^T + I_n \otimes \Psi)
\end{align}
where $\mathcal{L}=I_n \otimes L$, and $\otimes$ is the Kronecker product.   Details for this model formulation as well as generalizations can be found in \cite{zigler_estimating_2012}, \cite{wackernagel_multivariate_2003}, \cite{finley_spbayes:_2007}, and  \cite{banerjee_gaussian_2008}.

For the MCMC, $K(s,s)$ is updated via updates  of $L_\mathbf{a}$, which are the lower-triangular Cholesky roots of the $q\times q$ diagonal blocks of $K(s,s)$ that are informed by the observed data.  The off-diagonal blocks of $K(s,s)$ are updated according to the pre-specified value of a sensitivity parameter, $\omega$, and the values of $L_\mathbf{a}$, subject to a positive-definiteness constraint.  For this analysis, we fix $\omega=0$.  Under the prior specification detailed below, the posterior distribution for $\beta$ is multivariate normal, with samples drawn using a fully-conditional Gibbs step.  All other parameters and missing data are updated with a Metropolis step using normal proposal distributions, with appropriate transformations for all variables having restricted support.  Each diagonal element of $\Psi$, each $\nu$, and each missing quantity are updated individually, with block updating carried out for $\beta$, $\alpha^0=(\alpha_0^0, \alpha_1^0, \alpha_2^0)$, $\alpha^1=(\alpha_0^1, \alpha_1^1, \alpha_2^1)$, and $(L_0,L_1)$. Fore each model, MCMC chains are run for 32000 iterations.  After discarding the first 5,000 iterations as burn in, inference is based on every 10$^{th}$ posterior sample.  
  
As pointed out in \cite{finley_spbayes:_2007}, values of $\nu$ are only weakly identifiable and require reasonably informative priors for satisfactory MCMC behavior, but the model decomposition described above entails adequate structure to identify $\Sigma_{\tilde{W}}, L, \Sigma_W$, and $\Psi$.  We treat the parameters $\beta, \Psi, L_0,L_1, \nu, \alpha^0,$ and $\alpha^1$, as \textit{a priori} independent.  We specify flat priors for $\beta, \alpha^0$ and $\alpha^1$.  For the diagonal elements of $\Psi$, we specify independent inverse-gamma distributions with shape parameters set to 2 and scale parameters set to 0.5.  For $\nu_k$, we specify uniform prior distributions on the interval $(0.45,3.38)$.  Parameters for the prior distributions of $\Psi$ and $\nu$ are meant to reflect diffuse prior information within the range of plausible parameter values.  For each diagonal element of $K(s,s)$, we specify an inverse-ganna prior distribution with shape parameter set to 2 and scale parameter set to 0.5. 

\subsection{Assumption about interference between locations}\label{app:pm10:interference}
Mortality outcomes and pollution levels are only observed under the program $\mathbf{A=a^{obs}}$.  Therefore, we require assumptions to relate observed potential outcomes to those that would have been observed under the hypothetical scenario with no nonattainment designations, which we denote with $\mathbf{A} = \mathbf{0}$.  Typically, this would be achieved with the assumption of no interference between observational units (or the Stable Unit Treatment Value Assumption, \cite{rubin_discussion_1980}), which states that potential outcomes for a given location are unrelated to designations of all other locations.  This assumption implies that there are exactly two sets of potential outcomes for each location: pollution and mortality if that location is regulated and pollution and mortality if that location is unregulated.  Thus, with no interference, potential outcomes under any hypothetical program $\mathbf{A=a}$ could be considered on a location-by-location basis, with $(X_\mathbf{a}(s), Y_\mathbf{a}(s)) = (X_\mathbf{a^{obs}}(s), Y_\mathbf{a^{obs}}(s))$ as long as $\mathbf{a}$ and $\mathbf{a^{obs}}$ entail the same regulation status for location $s$.  

In studies of air pollution, however, the assumption of no interference does not likely hold because regulations at a given location likely impact air quality at nearby locations.  Thus, knowing the observed potential outcomes at location $s$ under $\mathbf{A=a^{obs}}$ does not imply knowledge of the potential outcomes under any other $\mathbf{A=a}$ because potential outcomes for $s$ can differ when regulations are allocated differently to other locations.  In fact, with no assumptions regarding interference, potential outcomes for each location are distinctly defined for each different possible regulation programs because changing the regulation designation of any location could impact potential outcomes at all other locations. 

We liken investigation of the nonattainment designations to previously-considered problems of ``partial interference'' \citep{sobel_what_2006} where observations within a clearly-defined group (e.g., residents of a particular neighborhood) interfere with one another, but observations in different groups (e.g., residents of distant neighborhoods) do not.  Unlike previously considered partial-interference settings, there are no clearly defined interference sets for analyzing the \PMTen nonattainment designations (e.g., assuming no interference between locations in different counties might be too restrictive, especially for observations near county borders).  We argue that a unique feature of the present context is that nonattainment designations were ``assigned'' with some implicit regard to interference because one criterion for a nonattainment designation was contribution to a NAAQS violation in a nearby area.  That is, if weather patterns or mere proximity led pollution in one location to affect pollution in another location, the EPA ensured that these two locations shared the same regulation designation.   



Let $\mathcal{R}^\mathbf{a^{obs}}$ and $\mathcal{U}^\mathbf{a^{obs}}$ respectively denote the set of 219 nonattainment (i.e., ``regulated'') and 276 attainment (i.e., ``unregulated'') locations under the program $\mathbf{A=a^{obs}}$.  We adopt what we term the \textit{assignment group interference assumption (AGIA)} to reflect the notion that locations within $\mathcal{R}^\mathbf{a^{obs}}$ do not interfere with those in $\mathcal{U}^\mathbf{a^{obs}}$.  Thus, changing the regulation designation of any location in $\mathcal{U}^\mathbf{a^{obs}}$ would not change the potential outcomes of locations in $\mathcal{R}^\mathbf{a^{obs}}$  (and \textit{vice versa}).  A consequence of this assumption is that  $(X_\mathbf{1}(s), Y_\mathbf{1}(s)) = (X_\mathbf{a^{obs}}(s), Y_\mathbf{a^{obs}}(s))$ for $s \in \mathcal{R}^\mathbf{a^{obs}}$ and  $(X_\mathbf{0}(s), Y_\mathbf{0}(s)) = (X_\mathbf{a^{obs}}(s), Y_\mathbf{a^{obs}}(s))$ for $s \in \mathcal{U}^\mathbf{a^{obs}}$. Figure \ref{interferefig} graphically depicts the implication of AGIA.  In practice, this assumption implies that, in all attainment areas ($\mathcal{U}^\mathbf{a^{obs}}$), observed potential outcomes are the same as those that would have occurred if the EPA had not designated any other area (i.e., if there were no nonattainment designations).  The AGIA is discussed further in \cite{zigler_estimating_2012}.  


\begin{figure}[h]
\center
\caption{Structure of potential outcomes for different regulation programs under the Assignment Group Interference Assumption.  Points represent monitor locations in counties contained in portions of California and Arizona.}\label{interferefig}
\subfloat[$\mathbf{A=a^{obs}}$. All pollution and mortality outcomes are observed under this program.]{
\includegraphics[width=.3\textwidth , trim=0in 1.5in 0in 1in, clip]{Figures/interferefigobs.pdf}}
\hspace{.5in}
\subfloat[$\mathbf{A=0}$. Pollution and mortality outcomes are observed for locations in $\mathcal{U}^\mathbf{a^{obs}}$ and unobserved for those in $\mathcal{R}^\mathbf{a^{obs}}$.]{
\includegraphics[width=.3\textwidth, trim=0in 1.5in 0in 1in, clip]{Figures/interferefigA0.pdf}}
\hspace{.5in}
\subfloat[$\mathbf{A=1}$. Pollution and mortality outcomes are observed for locations in $\mathcal{R}^\mathbf{a^{obs}}$ and unobserved for those in $\mathcal{U}^\mathbf{a^{obs}}$.]{
\includegraphics[width=.3\textwidth, trim=0in 1.5in 0in 1in, clip]{Figures/interferefigA1.pdf}}
\end{figure}




\newpage
\section{Technical Details for Case Study 2: Power Plant Emissions Controls}\label{app:pp}
\subsection{Defining the Treatment: Scrubber Installations on Power Plants}\label{app:pp:scrubbers} 
Figure \ref{histogram} shows the distribution of the \% of each power plant's total heat input that is generated for an EGU with a scrubber.  Note that the vast majority of power plants have scrubbers on all or none of their EGUs.
\begin{figure}[h]
\center
\caption{Histogram of the \% of heat input generated from an EGU with a scrubber for each power-generating facility.}
\includegraphics[width=.7\textwidth, trim=0in .5in 0in 0in, clip]{Figures/histogram.pdf}\label{histogram}
\end{figure}

\subsection{Bayesian Nonparametric Models for Observed Distributions of Mediators and Outcome}\label{app:pp:obsmods}
We try to minimize parametric assumptions in our specification of the
models for the observed data.  In particular, we specify Dirichlet process mixtures of
multivariate normals \citep{escobar_bayesian_1995, muller_bayesian_1996, jara_dppackage:_2011} for the
distribution of  each mediator (emission). For each
intervention $z=0,1$ and baseline covariates $\boldsymbol{X}=\boldsymbol{x}$, the conditional distribution of the $k$-th observed
mediator is specified as
\begin{eqnarray*}
M_{k,i} | Z=z, \boldsymbol{X}=\boldsymbol{x} & \sim &
N(\beta^z_{k0,i}+\boldsymbol{x}^\top \boldsymbol{\beta}^z_{k1},\,\,
\tau^z_{k,i}), \qquad k=1,2,3;\, i=1,\cdots, n^z\\
\beta^z_{k0, i}, \tau^z_{k,i} & \sim & F_{k}^z,\\
F_{k}^z & \sim & DP (\lambda_{k}^z, \,\,\mathcal{F}_{k}^z),
\end{eqnarray*}
where subscript $k$ indicates the $k$-th mediator and $i$ indicates the
$i$-th observation and superscript $z$ indicates the intervention
received such that $\beta^z_{k0,i}$ and $\tau^z_{k,i}$
denote the intercept and precision parameters for the $i$-th
observation of the $k$-th mediator, respectively. Here, $DP$ denotes the Dirichlet process with two parameters, a mass
parameter ($\lambda_k^z$) and a base measure ($\mathcal{F}_k^z$) for
each mediator $k$ and intervention $z$. To not overly complicate the model we only `mixed' over the
intercept parameters and precisions in the conditional distributions,
$\beta^z_{k0, i}$ and $\tau^z_{k,i}$. The base distribution $\mathcal{F}_k^z$ is taken to be the
conjugate normal-Gamma distribution,
$N(\mu_k^z, S_k^z) G(a_k^z,b_k^z)$, where $S_k^z$ is the precision parameters and the Gamma is
parametrized as the mean to be $a_k^z / b_k^z$ and we set a
Gamma prior $G(1,1)$ on $\lambda_k^z$, the inverse of the mass parameter
$\lambda_k^z$ \citep{rasmussen_infinite_1999}. For the hyper-priors,
we follow the specification from \cite{taddy_Bayesian_2008} such
that $\mu_k^z \sim (\mu_k^{z\star}, S_k^{z\star}), S_k^z \sim G(a_k^{z\star},
b_k^{z\star})$ and $a_k^z \sim \text{Unif}(0.1, 10), b_k^z  = a_k^z \times
\hat{\Sigma}_k^z/2$ where $\hat{\Sigma}_k^z$ is the MLE of the variance of
$M_k(z)$. $S_k^{z\star}$ is set to $2/\hat{\Sigma}_k^z$ and $\mu_k^{z\star}$ is
set to the mean of the data. And $a_k^{z\star} \sim \text{Unif}(0.1, 10), b_k^{z\star}  = a_k^z \times
\hat{\Sigma}_k^z/2$. From these specifications, $E(\tau_{k,i}^{z}) =E(S_k^{z}) =
E(S_k^{z\star}) = \hat{\Sigma}_k^z/2$ (i.e., the expected variance
components are an attenuated value of the MLE of the variance of the
data) in order to avoid fitting only one global distribution over the
whole data points.

Similarly, we can specify Dirichlet process mixtures of multivariate
normals for the distribution of each outcome. For each intervention
$z=0, 1$, the conditional distribution of the observed outcome (ambient \PMTwo) is
specified as
\begin{eqnarray*}
Y_{i} | Z=z,  \boldsymbol{X}=\boldsymbol{x} & \sim &
N(\gamma^z_{0,i}+\boldsymbol{x}^\top \boldsymbol{\gamma}^z_{1},\,\,
\omega^z_{i}), \qquad i=1,\cdots, n^z\\
\gamma^z_{0, i}, \omega^z_{i} & \sim & G^z,\\
G^z & \sim & DP (\eta^z,\,\, \mathcal{G}^z),
\end{eqnarray*}
where all details follow in the same manner as the specification of
the mediator models. 

These observation models can be represented as
the stick-breaking construction \citep{sethuraman_constructive_1994}
which can be approximated by a finite mixture of normals such that,
for example, the conditional distribution of $M_1$ under intervention $z=1$ can be
represented as
\[f_{M_1}(m|z=1, \boldsymbol{x}) = \sum_{n=1}^N \theta_{n} N(m\, ;\,
\beta_{10,n}^{z=1} + \boldsymbol{x}^\top \boldsymbol{\beta}_{11}^{z=1}, \tau_{1,n}^{z=1}  ), \]
where $\theta_n = \theta^\prime_{n} \prod_{h<n} (1-\theta^\prime_h),
\theta^\prime_h \sim \text{Beta}(1, \lambda_1)$, and $(\beta_{10,n}^{z=1},
\tau_{1,n}^{z=1}) \overset{iid}{\sim} \mathcal{F}_1^{z=1}$ and $N$ is a
maximum number of clusters.

With the marginal distributions specified as above, we can specify the joint distribution of the outcome and the
mediators under the same intervention via a Gaussian copula model
\citep{nelsen_introduction_1999} and each conditional distribution of the mediator and
the outcome above. Specifically, it has the following form for each
intervention $Z=0, 1$:
\begin{eqnarray*}
 \lefteqn{F_{M_1,M_2,M_3, Y}(m_1, m_2,
   m_3, y \,|\,Z=z, \boldsymbol{X}=\boldsymbol{x}) =} \\
 & &
 \Phi_{4}[\Phi_{1}^{-1}\{F_{M_1}(m_1|Z=z,
 \boldsymbol{x})\},\Phi_{1}^{-1}\{F_{M_2}(m_2|Z=z, \boldsymbol{x})\},
 \Phi_{1}^{-1}\{F_{M_3}(m_3|Z=z,
 \boldsymbol{x})\},\Phi_{1}^{-1}\{F_{Y}(y|Z=z, \boldsymbol{x})\}],
\end{eqnarray*}
where $\Phi_1$ is the univariate standard normal CDF and $\Phi_4$ is
the multivariate normal CDF with mean $\mathbf{0}$ and a correlation
matrix $R$. Since the outcome and the mediators under the same
intervention are observed by the data, the correlation matrix $R$ can
be identified by the observed data. 

\subsection{Assumptions for Identification of Principal Causal Effects and Mediation Effects}\label{app:pp:assumptions}
The models in Section \ref{app:pp:obsmods} are for the mediator and outcome values that are actually observed.  Identification of causal effects requires estimation of both observed and unobserved potential outcomes for each power plant.  This requires assumptions related to the unobserved but {\it observable} outcomes for the principal causal effects, and additional assumptions related to unobserved and {\it unobservable} outcomes for the mediation effects.

The conditional distribution $[Y(z; \bM(z_1, z_2,
z_3)) \,|\, \bM(0,0,0) = \mathbf{m}_{0,0,0}, \bM(1,1,1)=\mathbf{m}_{1,1,1}, \boldsymbol{X}=\boldsymbol{x}]$ is denoted by
$f_{z, \bM(z_1,z_2,z_3)}(y\,|\,\mathbf{m}_{0,0,0}, \mathbf{m}_{1,1,1},
\boldsymbol{x})$ where $\mathbf{m}_{z_1, z_2, z_3}$ is a vector of
realized values of $\text{SO}_2$, $\text{NO}_x$ and $\text{CO}_2$ under the interventions $z_1, z_2, z_3$. The conditional distribution $[\bM(z_1,z_2,z_3)|\boldsymbol{X}=\boldsymbol{x}]$
is denoted by 
$f_{\bM(z_1,z_2,z_3)}(\mathbf{m}_{z_1,z_2,z_3} |
\boldsymbol{x})$. Other conditional distributions are defined using
similar notation. 

\subsubsection{Assumptions for the principal causal effects}\label{app:pp:principalstrataassumptions}

{\as {\bf (Ignorability of treatment)} $$\{Y(z; \bM(z,z,z)),
  \bM(z,z,z) \}\independent Z | \boldsymbol{X}=\boldsymbol{x},$$} for
$z = 0,1$, which assumes that $\text{SO}_2$ scrubber installation status is ``randomized'' conditional on
$\boldsymbol{X}=\boldsymbol{x}$. With this assumption, we can identify the conditional
distributions of potential \PMTwo outcomes and the conditional distributions of emissions outcomes under each intervention based on observed data using the models from Section \ref{app:pp:obsmods} denoted with $f_{z, \bM(z,z,z)} (y | \boldsymbol{x})$ and $f_{M_k(z)}
(m|\boldsymbol{x})$.

{\as The joint distribution of potential mediators and outcomes
   conditional on covariates follows a Gaussian copula model \citep{nelsen_introduction_1999}:
\begin{eqnarray*}
 \lefteqn{F_{\bM(0,0,0),\bM(1,1,1) ,(0,\bM(0,0,0)),(1,\bM(1,1,1))}(\mathbf{m}_{0,0,0}, \mathbf{m}_{1,1,1}, y_0, y_1) =} \\
 & &
 \Phi_{8}[\Phi_{1}^{-1}\{F_{M_1(0)}(m_1)\},\Phi_{1}^{-1}\{F_{M_2(0)}(m_2)\},
 \Phi_{1}^{-1}\{F_{M_3(0)}(m_3)\},\Phi_{1}^{-1}\{F_{M_1(1)}(m_1)\},\\
& &\Phi_{1}^{-1}\{F_{M_2(1)}(m_2)\},\Phi_{1}^{-1}\{F_{M_3(1)}(m_3)\},\Phi_{1}^{-1}\{F_{(0,\bM(0,0,0))}(y_0)\} ,\Phi_{1}^{-1}\{F_{(1,\bM(1,1,1))}(y_1)\}]
\end{eqnarray*}
where $y_z$ indicates a value of potential outcome under intervention
$Z=z$ and $\Phi_1$ is the univariate standard normal CDF and $\Phi_{8}$ is the multivariate normal CDF with mean $\mathbf{0}$ and a correlation matrix $R$.}

Here, for notational simplicity, we omit
covariates, $\boldsymbol{X}$, when noting conditional distributions of mediators
and outcomes. In Section \ref{app:pp:obsmods}, the joint distribution of
the observed data (outcome and mediators under the same scrubber status)
was specified as a separate Gaussian copula model for each intervention $Z=z$,
and we are now assuming the joint distribution of potential
outcomes and mediators under both interventions $Z=0, 1$.
Through this model, we have a benefit of
flexibility in the marginal distributions (which we modeled earlier using
Bayesian nonparametric methods). Since any pair of $M_j(0)$ and
$[M_k(1), Y(1; \bM(1, 1, 1))]$ or any pair of
$Y(0; \bM(0, 0, 0))$ and $[M_k(1), Y(1;\bM(1,1,1))]$ given
covariates $\boldsymbol{X}=\boldsymbol{x}$ cannot be observed at the same time, we
cannot identify part of the correlation structure of the joint distribution from the observed data.  To identify these nonidentifiable pieces, we first adopt strategy from
\cite{zigler_estimating_2012} to specification of the correlations between
mediators under different interventions. Specifically, we set
\begin{itemize}
\item[(1)] $cor(M_j(0), M_k(1)) = \frac{cor(M_j(0), M_k(0))+cor(M_j(1), M_k(1))}{2}\times
\rho_1, \quad \text{ for   } \,\,  j, k=1, 2, 3,$
\end{itemize}
where $\rho_1$ is a sensitivity parameter. This strategy implies that
(a) the correlation between the same mediator $(j = k)$ under opposite
interventions
is $\rho_1$, and (b) the correlation between different mediators $(j
\neq k)$
under opposite interventions is attenuated version of the correlation
observed separately under each intervention. Furthermore, it is
reasonable to assume that magnitude of correlations (b) do not
exceed the observed correlations between two mediators under the same
intervention. Thus, we set a constraint
$ |\rho_1| \leq 2\times \frac{\text{min} \left\{|cor(M_j(0), M_k(0))|, \,|cor(M_j(1), M_k(1))|\right\}}{|cor(M_j(0), M_k(0))+cor(M_j(1), M_k(1))|}$ for
all $j \neq k$ combinations. We use one sensitivity parameter to specify these correlations, but a different parameter could be specified for each mediator separately.  For the analysis presented in the main text, we specify a uniform prior distribution over $(0, 0.75)$ for $\rho_1$.  

We also need to specify correlations between $Y(0; \bM(0,0,0))$ and
$M_k(1)$ and between  $Y(1; \bM(1,1,1))$ and
$M_k(0)$ for all $k=1,2,3.$ This can be done by the same strategy from
the above specification for $k=1,2,3$
\begin{itemize}
\item[(2)] $cor(M_k(z^\prime), Y(z; \bM(z,z,z))) 
 =   \frac{cor(M_k(0), Y(0; \bM(0,0,0)))+cor(M_k(1), Y(1; \bM(1,1,1)))}{2}\times
\rho_2, $
\end{itemize}
where $z^\prime = 1-z$ for $z \in \{0,1\}$ and $\rho_2$ is another sensitivity parameter. This specification
implies that the correlation between mediators under intervention
$z=1$ ($z=0$) and the outcome under intervention $z=0$ ($z=1$) is attenuated version
of the correlations between the mediators and the outcome under the same
interventions. We also restrict a
range of possible values as $|\rho_2| \leq 2 \times \frac{\text{min} \left\{|cor(M_k(0), Y(0; \mathbf{M}(0,0,0)))|,\, |cor(M_k(1), Y(1; \mathbf{M}(1,1,1)))|\right\}}{|cor(M_k(0), Y(0, \mathbf{M}(0,0,0)))+cor(M_k(1), Y(1, \mathbf{M}(1,1,1)))}.$  For the analysis presented in the main text, we specify a uniform prior distribution over $(0, 0.15)$ for $\rho_2$.  




Note that we do not need to specify a third sensitivity parameter for the nonidentifiable association between 
$(Y(0;\bM(0,0,0), Y(1;\bM(1,1,1))$. Although this would be required to construct
the full joint distribution of all observable potential outcomes, the estimands we consider do not depend on this correlation.  This is due in part to the following assumption:

{\as {\bf (Conditional independence \rom{1})} $Y(0; \bM(0,0,0))$ and $Y(1; \bM(1,1,1))$ are conditionally independent
given all potential mediators under $z=0$ and $z=1$ and covariates $\boldsymbol{X}$.}

\noindent This assumption states that the potential values of
$\text{PM}_{2.5}$ under both interventions are independent of each
other conditional on all potential values of $\text{SO}_2$,
$\text{NO}_x$ and $\text{CO}_2$ under both interventions and baseline covariates.
This is not necessary to estimate the posterior means of
the principal causal effects but necessary to estimate other features
of the posterior distribution of the principal causal effects such as
bounds for posterior variances. In this paper, the mean differences
are our primary causal estimands and, therefore, this assumption is
not needed.



\subsubsection{Assumptions for the mediation effects}\label{app:pp:mediationassumptions}
Assumptions 1-3 pertain exclusively to observed outcomes and unobserved but {\it observable} outcomes of ambient \PMTwo and emissions. As noted in the main text, identification of natural indirect effects requires assumptions that relate observed outcomes to {\it unobservable} potential outcomes of \PMTwo and emissions that are simultaneously subject to different scrubber statuses.  We specify such assumptions here. 

\noindent {\bf Assumption 3$^\bigstar$. (Conditional Independence \rom{2})} {\it $\{Y(z; \bM(z_1, z_2, z_3)) :
  (z, z_1, z_2, z_3) \in \{0,1\}^{\otimes 4}\} $ are conditionally
  independent given all potential mediators and covariates $\boldsymbol{X}$.}

It is important to note that Assumption 3 is a specific case of
Assumption 3$^\bigstar$.  Again, this assumption is not necessary to estimate posterior means of
NDE, JNIE's and other indirect effects but necessary to estimate other
features of the posterior distribution of mediation effects such as  an upper bound on the
posterior variance of the indirect effect assuming a non-negative
correlation between potential outcomes.


{\as For a given intervention $Z=1$, the conditional distribution of
  the potential outcome given all potential mediators and covariates is the same whether corresponding mediators were induced by $Z=1$ or $Z=0$.}

This assumption implies that the unobservable potential outcomes $Y(1; \bM(0, 0, 0))$ and  the observed potential outcomes $Y(1; \bM(1,1,1))$ have the same conditional distribution,
\begin{eqnarray}
\lefteqn{f_{1,\bM(0,0,0)}(y\,|\,\bM(0,0,0)=\mathbf{m},
  \bM(1,1,1), \boldsymbol{x})} \nonumber\\  
& = & f_{1,\bM(1,1,1)}(y\,|\,\bM(0,0,0),
\bM(1,1,1) =\mathbf{m}, \boldsymbol{x}) \label{as1}, 
\end{eqnarray}
where the conditional distribution of the RHS has
$\mathbf{m}$ as a vector of realized values of
when an $\text{SO}_2$ scrubber is installed. That is, the conditional distribution of the unobservable potential \PMTwo concentration when a scrubber is installed but emissions are set to the value $\mathbf{m}$ that they would have been absent the scrubber is equal to the observed distribution of \PMTwo observed around power plants that had scrubbers and were observed to have emissions $\mathbf{m}$.


This assumption is also defined for cases of any two mediators or
single pollutant(s) emitted under different interventions. For
instance, the potential outcomes of $\text{PM}_{2.5}$ $Y(1; \bM(0, 1, 0))$ and $Y(1; \bM(1,
1, 1))$ have the same conditional distribution regardless of whether
corresponding emissions values arose under a scrubber
($Z=1$) or absent a scrubber ($Z=0$),
\begin{eqnarray*}
\lefteqn{f_{1,\bM(0,1,0)}(y\,|\,\bM(0,1,0)=\mathbf{m},
  \bM(1,0,1) , \boldsymbol{x}) }\\
& = &f_{1,\bM(1,1,1)}(y\,|\,\bM(0,0,0),\bM(1,1,1)=\mathbf{m}, \boldsymbol{x}).
\end{eqnarray*}
Similarly, the potential outcomes $Y(1; \bM(1, 1, 0))$ and $Y(1; \bM(1, 1, 1))$ have the same conditional distribution,  
\begin{eqnarray*}
\lefteqn{f_{1,\bM(1,1,0)}(y\,|\,\bM(1,1,0)=\mathbf{m},
  \bM(0,0,1) , \boldsymbol{x})}\\
 & = & f_{1,\bM(1,1,1)}(y\,|\, \bM(0,0,0), \bM(1,1,1)=\mathbf{m}, \boldsymbol{x}).
\end{eqnarray*}

The key point is that unobservable \PMTwo concentrations under a certain hypothetical emissions amount ($\mathbf{m}$) are assumed to have the same distribution as that of the \PMTwo concentrations among power plants that had, in reality, emissions of $\mathbf{m}$ observed, regardless of observed scrubber status. 

It is worth noting that this Assumption 4 induces the following property 
\begin{eqnarray}
Y(1;\bM(1,1,1)) \independent \bM(0,0,0) | \bM(1,1,1)=\mathbf{m},
\boldsymbol{X}=\boldsymbol{x}\label{additional}
\end{eqnarray}
for all $\mathbf{m}$ and $\boldsymbol{x}$ values since Equality (\ref{as1}) holds regardless of $\bM(0,0,0)$ in the
conditioning part of the RHS for all realized values of $\bM(1,1,1)$
and $\boldsymbol{X}$. This property simplifies posterior computation.

Also, note that Assumption 4 is consistent with Assumption 2 (the joint
distribution of outcomes and mediators) since Assumption 2 only
  impacts on the conditional distribution of $Y(1;\bM(1,1,1))$ which is
  the RHS of Assumption 4.

We summarize assumptions proposed in the previous section and this
section with Table \ref{Assumption}. 
\begin{table}
\centering
\begin{tabular}{|c|c|c|}
\hline
Effects & Assumptions & Note\\
\hline
\hline
Principal Causal Effect & A1, A2, A3 & \\
\hline
Mediation Effect & A1, A2, A3$^\bigstar$, A4 & A3$^\bigstar$ implies A3\\
\hline
\end{tabular}
\caption{Assumptions needed for identifying each effect. `A' indicates
assumption.}\label{Assumption}
\end{table}
Assumption 1-3 are sufficient to identify the principal causal
effects. To identify the mediation effects, we replace Assumption 3
with stronger assumption, Assumption 3$^\bigstar$ and propose an
additional assumption, Assumption 4. It is again worth noting that
Assumption 1-3 only contain the observable outcomes while 
Assumption 3$^\bigstar$-4 additionally contain the unobservable outcomes. Thus, we modularize assumptions so that estimation of principal causal effects rely on Assumptions 1, 2, and 3, while estimation of mediation effects relies on Assumptions 1, 2,  3$^\bigstar$, and 4.  

\subsubsection{Proofs of Identification}\label{app:pp:identification}
In the following, we will prove that Assumptions 1-3 are sufficient to
identify the distribution of the principal causal effects and
Assumption 1,2,3$^\bigstar$ and 4 are sufficient to identify the
distributions of NDE, JNIE's and NIE's. Here, we proceed with 3
mediators, $\text{SO}_2$, $\text{NO}_x$, and $\text{CO}_2$, but it is
straightforward to extend this to $K\geq 3$ cases.  

{\thm The posterior distributions of the principal causal effects
  (associative effect and dissociative effect) are
  identified under Assumption 1-4.}

\noindent {\bf Proof :}

To obtain the posterior distribution of the principal causal effect,
it is sufficient to identify the conditional joint distribution of the potential
outcomes and mediators $[Y(0;\bM(0,0,0))= y_0, Y(1;\bM(1,1,1)) = y_1,
\bM(0,0,0)=\mathbf{m}_{0,0,0}, \bM(1,1,1)=\mathbf{m}_{1,1,1}],$
\begin{eqnarray}
\lefteqn{f(y_0, y_1,\mathbf{m}_{0,0,0},\mathbf{m}_{1,1,1})}\nonumber\\
& = &\int f_{(0,\bM(0,0,0)), (1,\bM(1,1,1))}(y_0,
y_1|\mathbf{m}_{0,0,0},\mathbf{m}_{1,1,1},\boldsymbol{x})\,
f_{\bM(0,0,0),
  \bM(1,1,1)}(\mathbf{m}_{0,0,0},\mathbf{m}_{1,1,1}|\boldsymbol{x})
dF_{\boldsymbol{X}} (\boldsymbol{x}) \nonumber\\
& =& \int \Big\{ f_{0,\bM(0,0,0)}(y_0|\mathbf{m}_{0,0,0},\mathbf{m}_{1,1,1},\boldsymbol{x})
\,
f_{1,\bM(1,1,1)}(y_1|\mathbf{m}_{0,0,0},\mathbf{m}_{1,1,1},\boldsymbol{x}) \Big\}\, f_{\bM(0,0,0),
   \bM(1,1,1)}(\mathbf{m}_{0,0,0},\mathbf{m}_{1,1,1}|\boldsymbol{x})
 dF_{\boldsymbol{X}} (\boldsymbol{x})   \label{fact1}
\end{eqnarray} 
where the equality in (\ref{fact1}) from Assumption 3 all terms are
identified by Assumption 1 and 2 with the observed data model. Then, the posterior distribution of the
principal causal effect is a function of (\ref{fact1}).

{\thm The posterior distributions of \textnormal{NDE} $\textnormal{JNIE}_{123}$,
  $\textnormal{JNIE}_{jk}$, and $\textnormal{NIE}_{k}$ are identified
  under Assumptions 1, 2, 3$^\bigstar$ and 4. }

\noindent {\bf Proof :}

To obtain the posterior distributions of NDE, JNIE's and NIE's, it is sufficient to identify the joint distribution of the potential outcomes $[Y(1; \bM(1,1,1)) = y_1, Y(0; \bM(0,0,0)) = y_2, Y(1; \bM(0,0,0)) = y_3, Y(1; \bM(1,0,0)) = y_4, Y(1; \bM(0,1,0)) = y_5, Y(1; \bM(0,0,1)) = y_6, Y(1; \bM(1,1,0)) = y_7, Y(1; \bM(1,0,1)) = y_8, Y(1; \bM(0,1,1)) = y_9]$,
\begin{eqnarray*}
\lefteqn{f(y_1, y_2, y_3, y_4, y_5, y_6, y_7, y_8, y_9) =}\\
& & \int \Big\{ f(y_1, y_2, y_3, y_4, y_5, y_6, y_7, y_8,
y_9\,|\,\mathbf{m}_{0,0,0},\mathbf{m}_{1,1,1}, \boldsymbol{x})\\
& & \,\,\,\,\,\,\,\,\,\,\,\, \times
f_{\bM(0,0,0),\bM(1,1,1)}(\mathbf{m}_{0,0,0},\mathbf{m}_{1,1,1}\,|\,\boldsymbol{x}) \Big\}
\,d\mathbf{m}_{0,0,0} \,d\mathbf{m}_{1,1,1} \,dF_{\boldsymbol{X}}(x),
\end{eqnarray*}
where $\mathbf{m}_{z_1,z_2,z_3}$ denotes a vector of realized values of
the mediators $\{M(z_1), M(z_2), M(z_3) \}$.
With omitting $\boldsymbol{x}$ for notation simplicity, the second term in the RHS can be identified by Assumption 2. The first term in the RHS can be factored as
\begin{eqnarray}
\lefteqn{f(y_1, y_2, y_3, y_4, y_5, y_6, y_7, y_8,
  y_9|\mathbf{m}_{0,0,0},\mathbf{m}_{1,1,1})}\nonumber\\ 
& =&  f_{1,\bM(1,1,1)}(y_1|\mathbf{m}_{0,0,0},\mathbf{m}_{1,1,1} )
\,\,f_{0,\bM(0,0,0)}(y_2|\mathbf{m}_{0,0,0},\mathbf{m}_{1,1,1})\nonumber\\
& & \times f_{1,\bM(0,0,0)}(y_3|\mathbf{m}_{0,0,0},\mathbf{m}_{1,1,1} )
\,\,f_{1,\bM(1,0,0)}(y_4|\mathbf{m}_{0,0,0},\mathbf{m}_{1,1,1} ) \nonumber\\
& & \times f_{1,\bM(0,1,0)}(y_5|\mathbf{m}_{0,0,0},\mathbf{m}_{1,1,1})\,\,f_{1,\bM(0,0,1)}(y_6|\mathbf{m}_{0,0,0},\mathbf{m}_{1,1,1} )\nonumber\\
& & \times f_{1,\bM(1,1,0)}(y_7|\mathbf{m}_{0,0,0},\mathbf{m}_{1,1,1})\,\,f_{1,\bM(1,0,1)}(y_8|\mathbf{m}_{0,0,0},\mathbf{m}_{1,1,1})\nonumber\\
& & \times f_{1,\bM(0,1,1)}(y_9|\mathbf{m}_{0,0,0},\mathbf{m}_{1,1,1} ) \\
& =&  f_{1,\bM(1,1,1)}(y_1|\bM(1,1,1)=\mathbf{m}_{1,1,1} )
\,\,f_{0,\bM(0,0,0)}(y_2|\mathbf{m}_{0,0,0},\mathbf{m}_{1,1,1} )\nonumber\\
& & \times f_{1,\bM(1,1,1)}(y_3|\bM(1,1,1)=\mathbf{m}_{0,0,0} )
\,\,f_{1,\bM(1,1,1)}(y_4|\bM(1,1,1)=\mathbf{m}_{1,0,0}) \nonumber\\
& & \times f_{1,\bM(1,1,1)}(y_5|\bM(1,1,1)=\mathbf{m}_{0,1,0} )\,\,f_{1,\bM(1,1,1)}(y_6| \bM(1,1,1)=\mathbf{m}_{0,0,1} )\nonumber\\
& & \times f_{1,\bM(1,1,1)}(y_7|\bM(1,1,1)=\mathbf{m}_{1,1,0} )\,\,f_{1,\bM(1,1,1)}(y_8|\bM(1,1,1)=\mathbf{m}_{1,0,1} )\nonumber\\
& & \times f_{1,\bM(1,1,1)}(y_9|\bM(1,1,1)=\mathbf{m}_{0,1,1})  \label{fact2}
\end{eqnarray} 
where the first equality follows from Assumption 3$^\bigstar$ and the second
equality follows from Assumption 4. All terms in (\ref{fact2}) are
identified by Assumption 2. Note that, in (\ref{fact2}), all conditional
distributions for potential outcomes except one for $Y(0,\bM(0,0,0))$
are independent of $\bM(0,0,0)$ which simplify the posterior computation
and result in efficient estimates. Then, the posterior distributions of NDE,
JNIE's and NIE's are functions of (\ref{fact2}).



\subsection{Posterior Computation}\label{app:pp:computation}
In this section, we provide the MCMC computational algorithm for estimating principal causal effects and the mediation effects.
To compute the posterior distribution of the principal causal effects,
for example the associative effect defined on all three pollutants
($\text{SO}_2$, $\text{NO}_x$, $\text{CO}_2$), we obtain $N$ posterior samples of the parameters in the observation models
via Stan \citep{guo_rstan:_2015}. Then, for each set of sampled parameters with
sensitivity parameters $\rho_1$ and $\rho_2$, we do the following
steps.

\small
\begin{enumerate}
\item Generate $S$ samples of $[y_0,y_1, \mathbf{m}_0,
  \mathbf{m}_1]$ from the distribution of
  $[Y(0,\bM(0,0,0)), Y(1,\bM(1,1,1)), \bM(0,0,0), \bM(1,1,1) |
  \boldsymbol{X}=\boldsymbol{x}]$ where $y_0$, $y_1$, $\mathbf{m}_{0,0,0}$ and $\mathbf{m}_{1,1,1}$ are realized samples of
  $Y(0,\bM(0,0,0))$, $Y(1,\bM(1,1,1))$, $\mathbf{M}(0,0,0)$ and $\mathbf{M}(1,1,1)$, respectively.
\item Compute $E[Y(1;\bM(1,1,1)) \,\big\vert\,| \mathbf{M}(1,1,1)
  -\mathbf{M}(0,0,0) |_{\mathcal{K}}> C_{\mathcal{K}}^A, \boldsymbol{X} =\boldsymbol{x} ]$ as follows,
\begin{eqnarray*}
E[Y(1;\bM(1,1,1)) \,\big\vert\, | \mathbf{M}(1,1,1)
  -\mathbf{M}(0,0,0) |_{\mathcal{K}}> C_{\mathcal{K}}^A, \boldsymbol{X}=\boldsymbol{x} ] = \frac{1}{S} \sum_{i=1}^{S} y_{1i} \,I (|\mathbf{m}_{1,1,1,i}
  -\mathbf{m}_{0,0,0,i} |_{\mathcal{K}}> C_{\mathcal{K}}^A),
\end{eqnarray*}
where $I()$ is an indicator function and $y_{1i}$, $\mathbf{m}_{0,0,0,i},$ and $\mathbf{m}_{1,1,1,i}$ indicate the $i$-th sample of the outcome under
intervention $z=1$ and sets of the mediators under intervention $z=0$ and
intervention $z=1$,
respectively. Analogously, compute $E[Y(0;\bM(0,0,0)) \,\big\vert\,| \mathbf{M}(1,1,1)
  -\mathbf{M}(0,0,0) |_{\mathcal{K}}> C_{\mathcal{K}}^A, \boldsymbol{X}=\boldsymbol{x} ]$.
\item Compute the associative effect,
\[
AE  = \int E[Y(1;\bM(1,1,1)) - Y(0;\bM(0,0,0)) \,\big\vert\,
|(\bM(1,1,1) - \bM(0,0,0) )|_{\mathcal{K}} > C _{\mathcal{K}}^A,\, \boldsymbol{x}] dF_{\boldsymbol{X}}(\boldsymbol{x})
\]
\end{enumerate}
\normalsize
Then, we iterate steps 1-3 $N$ times and estimate posterior means and
standard deviations of the associative effects. The dissociative
effect is computed in the exact same way with some threshold $C
_{\mathcal{K}}^D
$.



To compute the posterior distribution of the JNIE's, NDE and all
mediator-specific indirect effects, using the same $N$ sets of
parameters in the observation models from the above step, for each set of sampled
parameters and either fixed values for sensitivity parameters $\rho_1$
and $\rho_2$, we do the following
steps.
\small
\begin{enumerate}
\item Generate $S$ sets of samples $(\mathbf{m}_{0,0,0},
  \mathbf{m}_{1,1,1})$ from the joint distribution
  $[\bM(0,0,0), \bM(1,1,1) |
  \boldsymbol{X}=\boldsymbol{x}]$ where $\mathbf{m}_{0,0,0}$ and
  $\mathbf{m}_{1,1,1}$ are realized samples of $\mathbf{M}(0,0,0)$ and $\mathbf{M}(1,1,1)$, respectively.  
  \item Compute $f_{1,\bM(0,0,0)}(y | \boldsymbol{x})$ via Monte Carlo
  integration using $S$ sets of $(\mathbf{m}_{0,0,0},
  \mathbf{m}_{1,1,1})$ as follows,
\begin{eqnarray*}
\lefteqn{f_{1,\bM(0,0,0)}(y|\boldsymbol{x})}\\
 & = & \int \Big\{ f_{1,\bM(0,0,0)}(y | \bM(0,0,0)=\mathbf{m}_{0,0,0},
 \bM(1,1,1)=\mathbf{m}_{1,1,1}, \boldsymbol{x})  f_{\bM(0,0,0),
  \bM(1,1,1)|\boldsymbol{X}=\boldsymbol{x}}(\mathbf{m}_{0,0,0},
\mathbf{m}_{1,1,1}) \Big\} \,d \mathbf{m}_{0,0,0} \,d\mathbf{m}_{1,1,1}\\
 & = & \int \Big\{ f_{1,\bM(1,1,1)}(y | \bM(1,1,1)=\mathbf{m}_{0,0,0}, \boldsymbol{x}) f_{\bM(0,0,0),
  \bM(1,1,1)|\boldsymbol{X}=\boldsymbol{x}}(\mathbf{m}_{0,0,0},
\mathbf{m}_{1,1,1}) \Big\} \,d \mathbf{m}_{0,0,0} \,d\mathbf{m}_{1,1,1} \quad  \text{(A4)}\\
& \approx & \frac{1}{S}\sum_{i=1}^S f_{1,\bM(1,1,1)} (y|\bM(1,1,1) = \mathbf{m}_{0,0,0,i},\boldsymbol{x}),
\end{eqnarray*}
where $\mathbf{m}_{0,0,0,i}$ indicates the $i$-th sample of mediators,
$\mathbf{M}(0,0,0)$, and `A4' denotes Assumption 4. To compute
$f_{1,\bM(1,1,1)}(y|\bM(1,1,1)=\mathbf{m}_{0,0,0},\boldsymbol{x})$, we
note that this conditional distribution is equivalent to
\[\frac{f(Y(1;\bM(1,1,1))=y,
\bM(1,1,1)=\mathbf{m}_{0,0,0}, \boldsymbol{x})} { f(\bM(1,1,1)=\mathbf{m}_{0,0,0}, \boldsymbol{x})},\]
where all the terms are identifiable by Assumption 2 and the model
specifications in Section \ref{app:pp:obsmods}. All other distributions of
unobservable outcomes such as $f_{1,\bM(1,1,0)}(y|\boldsymbol{x})$,
$f_{1,\bM(1,0,1)}(y|\boldsymbol{x})$,
$f_{1,\bM(0,1,1)}(y|\boldsymbol{x})$,
$f_{1,\bM(1,0,0)}(y|\boldsymbol{x})$, $f_{1,\bM(0,1,0)}(y|\boldsymbol{x})$ and $f_{1,\bM(0,0,1)}(y|\boldsymbol{x})$ can be computed in the same manner.
\item Compute NDE, JNIE's, $\text{NIE}$'s,
\[\text{NDE} = E[Y(1, \bM(1,1,1)) - Y(0, \bM(0,0,0))]= \int\, y \,\big\{
f_{1,\bM(0,0,0)}(y|\boldsymbol{x}) -
f_{0,\bM(0,0,0)}(y|\boldsymbol{x}) \big\}\,dy\,dF_{\boldsymbol{X}}(\boldsymbol{x}),\]
\[\text{JNIE}_{123} = E[Y(1, \bM(1,1,1)) - Y(1, \bM(0,0,0))] = \int \,y \,\big\{
f_{1,\bM(1,1,1)}(y|\boldsymbol{x}) -
f_{1,\bM(0,0,0)}(y|\boldsymbol{x}) \big\}\,dy\,dF_{\boldsymbol{X}}(\boldsymbol{x}),\]
\[
\text{JNIE}_{12}   =  E[Y(1, \bM(1,1,1)) - Y(1, \bM(0,0,1))] =  \int \,y\,\big\{ f_{1,\bM(1,1,1)}(y|\boldsymbol{x}) -
f_{1,\bM(0,0,1)}(y|\boldsymbol{x}) \big\} \,dy\,dF_{\boldsymbol{X}}(\boldsymbol{x}),
\]
 and
\[\text{NIE}_1 = E[Y(1, \bM(1,1,1)) - Y(1, \bM(0,1,1))]= \int \,y\,\big\{
f_{1,\bM(1,1,1)}(y|\boldsymbol{x}) -
f_{1,\bM(0,1,1)}(y|\boldsymbol{x}) \big\}\,dy\, dF_{\boldsymbol{X}}(\boldsymbol{x}).\]
Other mediator specific effects, $\text{NIE}_2, \text{NIE}_3$, and
joint natural indirect effects of other pairs of mediators, $\text{JNIE}_{13}, \text{JNIE}_{23}$, are computed similarly.
\end{enumerate}
We iterate steps 1-3 $N$ times and estimate posterior means and
standard deviations of the effects. 


\subsection{Assessment of Overlap of Natural Indirect Effects}\label{app:pp:overlap}
A key benefit of our proposed methods relative to other approaches for multiple mediators is that we allow possible overlapping
between NIEs such that one can examine whether there is actually overlap between
NIEs or additivity of NIEs holds. We evaluate the
relationship between the joint effects $\text{JNIE}_{jk}$ and the
mediator-specific effects $\text{NIE}_1$, $\text{NIE}_2$,
$\text{NIE}_3$ through 
\begin{eqnarray*}
(\text{NIE}_1+\text{NIE}_2) - \text{JNIE}_{12}  & = & 0.001 (0.007), \\
(\text{NIE}_1+\text{NIE}_3) - \text{JNIE}_{13}  & = & -0.0003 (0.003), \\
(\text{NIE}_2+\text{NIE}_3) - \text{JNIE}_{23}  & = & 0.001 (0.008)
\end{eqnarray*}
which give no evidence of overlap between NIEs. That is, it appears as though the causal effect of an \SOTwo scrubber on ambient \PMTwo that is mediated through a given emission does not depend on the value of other emissions, i.e., there is no evidence of synergistic effects of joint reductions in multiple emissions.  




\newpage
\singlespacing
\bibliographystyle{biom}
\bibliography{HEIAccountabilityGrant}
\end{document}






